The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':
between, first, last
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Problem 1
In this question, the final output will be a string like “1/6”. For an integer such as 4, it will be shown as “4/1”.
Define functions of GCD and LCM
I can not use the Rcpp to create two functions at the same time for some reason I can not clarify. So I create a .cpp file to store these two functions. And the following is the code in the file named GCD_LCM.cpp.
# Define function of GCD and LCM# This part is based on ChatGPT.# #include <Rcpp.h># using namespace Rcpp;# # // GCD# int GCD(int a, int b) {# if (b == 0)# return a;# return GCD(b, a % b); # }# # // LCM# int LCM(int a, int b) {# return (a * b) / GCD(a, b);# }# # // [[Rcpp::export]]# int gcd(int a, int b) {# return GCD(a, b);# }# # // [[Rcpp::export]]# int lcm(int a, int b) {# return LCM(a, b);# }
# import the file to apply gcd and lcm functionsourceCpp("gcd_lcm.cpp")
Create class
# 1. Create the 'rational' class where numerator and denominator are integerssetClass("rational", slots =c(numerator ="numeric", denominator ="numeric"))# Constructor for rational classrational <-function(numerator, denominator) {new("rational", numerator = numerator, denominator = denominator)}
If there is an error, I will show it instead of stop here since it is required not to stop on errors.
# 2. Validity funtion # check if inputs are integers and if denominator is eauql to 0validity <-function(object) {# check if there is missing valuesif(any(is.na(object@numerator), is.na(object@denominator))) {return("Numerator and Denominator must be non-NA integers") }# check if numerator and denominator are integarsif (object@numerator !=as.integer(object@numerator)) {return("Numerator should be integers") }if (object@denominator !=as.integer(object@denominator)) {return("Denominator should be integers") }# check if the denominator is 0if(object@denominator ==0) {return("Denominator cannot be zero") }TRUE# if there is not any error}# set validity function for the classsetValidity("rational", validity)
setMethod("simplify","rational",function(object) {# get common divisor, at least having one of 1 common_divisor <-gcd(object@numerator, object@denominator)# simplify numerator and denominator numerator_simplified <- object@numerator / common_divisor denominator_simplified <- object@denominator / common_divisor# input new object to class rationalrational(as.integer(numerator_simplified), as.integer(denominator_simplified)) })
setMethod("quotient","rational",function(object, digits =3) {# the real value result <-as.numeric(object@numerator) /as.numeric(object@denominator) # print the value with certain digitsprint(format(result, digits = digits)) return(result) })
For the addition of fractions, the least common multiple (LCM) of the denominators should be found first. The new denominator is the LCM, and the new numerator is the original numerator multiplied by the LCM divided by the original denominator. Then, add the new numerators of the two fractions together, and finally simplify the result.
# 6.Define method of +, -, x, /# 6.1 +setMethod("+",signature(e1 ="rational", e2 ="rational"),function(e1, e2) {# lcm and new denominator at the same time lcm_denominator <-lcm(e1@denominator, e2@denominator)# new numerators numerator1_new <- e1@numerator * (lcm_denominator / e1@denominator) numerator2_new <- e2@numerator * (lcm_denominator / e2@denominator)# the sum of new numerators sum_numerator <- numerator1_new + numerator2_new# simplify the resultsimplify(rational(sum_numerator, lcm_denominator)) })
For the subtraction of fractions, the least common multiple (LCM) of the denominators should be found first. The new denominator is the LCM, and the new numerator is the original numerator multiplied by the LCM divided by the original denominator. Then, subtract the new numerators of the two fractions, and finally simplify the result.
# 6.2 -setMethod("-",signature(e1 ="rational", e2 ="rational"),function(e1, e2) {# lcm and new denominator at the same time lcm_denominator <-lcm(e1@denominator, e2@denominator)# new numerators numerator1_new <- e1@numerator * (lcm_denominator / e1@denominator) numerator2_new <- e2@numerator * (lcm_denominator / e2@denominator)# the difference of new numerators diff_numerator <- numerator1_new - numerator2_new# simplify the resultsimplify(rational(diff_numerator, lcm_denominator)) })
For the multiplication of fractions, multiply the numerators and denominators respectively, and then simplify the result.
# check if the validator does not allow the creation of rational’s with 0 denominatorrational(1, 0)
Error in validObject(.Object): invalid class "rational" object: Denominator cannot be zero
# check other malformed input# 1. if numerator or denominator is not an integerrational(1.4, 2)
Error in validObject(.Object): invalid class "rational" object: Numerator should be integers
rational(1, 2.2)
Error in validObject(.Object): invalid class "rational" object: Denominator should be integers
rational(1, 's')
Error in validObject(.Object): invalid class "rational" object: invalid object for slot "denominator" in class "rational": got class "character", should be or extend class "numeric"
# 2. check if there is a missing valuerational(1, NA)
Error in validObject(.Object): invalid class "rational" object: invalid object for slot "denominator" in class "rational": got class "logical", should be or extend class "numeric"
Problem 2
art <-read.csv("./df_for_ml_improved_new_market.csv")# clean the genreart$Genre___Others[art$Genre___Painting ==1] <-0art$genre <-"Photography"art$genre[art$Genre___Print ==1] <-"Print"art$genre[art$Genre___Sculpture ==1] <-"Sculpture"art$genre[art$Genre___Painting ==1] <-"Painting"art$genre[art$Genre___Others ==1] <-"Other"table(art$genre)
Other Painting Photography Print Sculpture
27 519 1746 414 1641
2.a
In part (a), I use the plot in the file of problem4_solution.
# define the top_values##' @title Subset a vector to values above some percentile##' @param vec A vector of values##' @param percentile A percentile to identifyselect_top_values <-function(vec, percentile) { val <-quantile(vec, percentile)return(vec[vec > val])}# Subset a vector to values above 95 percentilesave <-list()for (y inunique(art$year)) { prices <- art[art$year == y, "price_usd"] save[[as.character(y)]] <-data.frame(year = y,price_usd =select_top_values(prices, .95))}# We've got a list, use `do.call` to combine them all togetherarttop <-do.call(rbind, save)
# define the median valueartmedian <-aggregate(art$price_usd, by =list(art$year),FUN = median, na.rm =TRUE)names(artmedian) <-c("year", "price_usd")# convert the format to factorarttop$year <-as.factor(arttop$year)artmedian$year <-as.factor(artmedian$year)
# Create the box plot for the top 5% pricesp_a <-plot_ly(data = arttop, x =~year, y =~price_usd, type ='box', marker =list(outliercolor ="darkred", symbol ="o", size =4),name ="Top 5% Prices") # Add the median pricesp_a <-add_trace(p_a, data = artmedian, x =~year, y =~price_usd, type ='scatter', mode ='lines+markers',line =list(dash ='dash', width =1.2),name ="Median Prices")# Configure the layoutp_a <- p_a %>%layout(title ="Changes in Top 5% of Prices",xaxis =list(title ="Year"),yaxis =list(title ="Price in Million USD",tickvals =seq(0, 1400000, by =200000),ticktext =paste(seq(0, 1.4, 0.2), "M", sep ="")),legend =list(title =list(text =""), x =0.1, y =0.95# Position inside the plot )) p_a
While the median price does not change drastically, we see a large increase in the price for the most expensive sales starting around 2000 until around 2006, at which point it stabilizes.
2.b
# calculate the count of genreyeargenre <-with(art, table(year, genre))# calculate the proportion of genreygperc <- yeargenre/apply(yeargenre, 1, sum)ygperc <- ygperc[, c("Painting", "Sculpture", "Photography", "Print", "Other")]ygpercm <-as.data.frame(ygperc)# Reverse level of factors so ggplot draws it the same wayygpercm$genre <-factor(ygpercm$genre, levels =rev(unique(ygpercm$genre)))head(ygpercm)
# define median by year and genre in part bartmedian_b <-aggregate(art$price_usd, by =list(art$year, art$genre),FUN = median, na.rm =TRUE)names(artmedian_b) <-c("year", "genre", "price_usd")# define 97.5 percentile data by year and genreart975 <-aggregate(art$price_usd, by =list(art$year, art$genre),FUN = quantile, .975, na.rm =TRUE)names(art975) <-c("year", "genre", "price_usd")# the names of all artsgenres <-rev(unique(artmedian$genre))# convert theformat to factorartmedian_b$genre <-factor(artmedian_b$genre, levels =rev(unique(artmedian_b$genre)))art975$genre <-factor(art975$genre, levels =rev(unique(art975$genre)))# bind all the rows and add the label of median or 97.5%artcombine <-bind_rows( artmedian_b %>%mutate(measure ="Median"), art975 %>%mutate(measure ="97.5%"))artcombine$genre <-factor(artcombine$genre, levels =rev(unique(artcombine$genre)))artcombine$year <-factor(artcombine$year)head(artcombine)
year genre price_usd measure
1 2007 Other 6609.0 Median
2 2008 Other 8242.0 Median
3 2009 Other 7179.0 Median
4 2010 Other 9160.0 Median
5 2011 Other 5795.5 Median
6 2012 Other 7520.5 Median
# Part of the following codes are referenced from ChatGPT.fig <-plot_ly()# Add bar chart to the secondary Y-axisfig <- fig %>%add_trace(data = ygpercm, x =~year, y =~Freq, color =~genre, type ='bar', hoverinfo ='text',opacity =0.2, # Retain transparencytext =~paste('Year:', year, '<br>Genre:', genre, '<br>Proportion:', round(Freq*100, 2), '%'), name =~genre,yaxis ='y2',showlegend =TRUE )# Add line chart for price changes to the primary Y-axisfig <- fig %>%add_trace(data = artcombine, x =~year, y =~price_usd, color =~genre, type ='scatter', mode ='lines+markers',hoverinfo ='text',text =~paste('Year:', year, '<br>Genre:', genre, '<br>Price (USD):', price_usd, '<br>Measure:', measure),name =~paste(genre, measure), yaxis ='y', showlegend =TRUE )# Add layout and axis settingsfig <- fig %>%layout(title ="Changes in Price and Proportion by Genre Over Time",xaxis =list(title ='Year'),yaxis =list(title ='Price in Thousands USD', tickvals =seq(0, 400000, by =50000),ticktext =paste0(seq(0, 70, by =10), 'k') ),yaxis2 =list(title ='Proportion (%)', overlaying ='y', side ='right', rangemode ="tozero" ),barmode ='stack', # Set barmode to stacklegend =list(orientation ='v', # Vertical legendx =1.2, # Move legend to the right, outside the chart areay =1, # Align legend to the topxanchor ='left', # Anchor set to left alignmentbgcolor ='rgba(0,0,0,0)'# Transparent background ) )fig
In this chart, the file problem4_solution is referenced. It shows the sales proportion of different art genres across different years, as well as the changes in median price and the 97.5% price over time.
We can draw the conclusion that, over time, painting sales were replaced with photo/print sales as the digital age ramped up.
Photography prices increased the most, both in terms of median and large values. Painting/sculpture/print all saw similar but lesser increases.
Problem 3
3.a
# convert the format to data.tableflights_dt <-as.data.table(flights)airports_dt <-as.data.table(airports)
# the average and median departure delay timedepart_delay_time <- flights_dt[, .(# mean of department delay timemean_delay =mean(dep_delay, na.rm =TRUE), # median of department delay timemed_delay =median(dep_delay, na.rm =TRUE),# the number of flights in every airportnumflights = .N ),# exclude any destination with under 10 flightsby = origin][numflights >=10][# left join the table airports where origin equals faa airports, on = .(origin = faa), nomatch =0][, # filter rows .(name, mean_delay, med_delay)][# order table in descending mean delayorder(-mean_delay)]print(depart_delay_time)
name mean_delay med_delay
<char> <num> <num>
1: Newark Liberty Intl 15.10795 -1
2: John F Kennedy Intl 12.11216 -1
3: La Guardia 10.34688 -3
arrival_delay_time <- flights_dt[, .(# mean of arrival delay timemean_delay =mean(arr_delay, na.rm =TRUE),# median of arrival delay timemed_delay =median(arr_delay, na.rm =TRUE),# the number of flights in every airportnumflights = .N), # exclude any destination with under 10 flightsby = .(dest)][numflights >=10][# left join the table airports where origin equals faa airports, on = .(dest = faa), nomatch =0][, # filter rows .(name, mean_delay, med_delay)][# order table in descending mean delayorder(-mean_delay)]print(arrival_delay_time)
name mean_delay med_delay
<char> <num> <num>
1: Columbia Metropolitan 41.76415094 28.0
2: Tulsa Intl 33.65986395 14.0
3: Will Rogers World 30.61904762 16.0
4: Jackson Hole Airport 28.09523810 15.0
5: Mc Ghee Tyson 24.06920415 2.0
6: Dane Co Rgnl Truax Fld 20.19604317 1.0
7: Richmond Intl 20.11125320 1.0
8: Akron Canton Regional Airport 19.69833729 3.0
9: Des Moines Intl 19.00573614 0.0
10: Gerald R Ford Intl 18.18956044 1.0
11: Birmingham Intl 16.87732342 -2.0
12: Theodore Francis Green State 16.23463687 1.0
13: Greenville-Spartanburg International 15.93544304 -0.5
14: Cincinnati Northern Kentucky Intl 15.36456376 -3.0
15: Savannah Hilton Head Intl 15.12950601 -1.0
16: Manchester Regional Airport 14.78755365 -3.0
17: Eppley Afld 14.69889841 -2.0
18: Yeager 14.67164179 -1.5
19: Kansas City Intl 14.51405836 0.0
20: Albany Intl 14.39712919 -4.0
21: General Mitchell Intl 14.16722038 0.0
22: Piedmont Triad 14.11260054 -2.0
23: Washington Dulles Intl 13.86420212 -3.0
24: Cherry Capital Airport 12.96842105 -10.0
25: James M Cox Dayton Intl 12.68048606 -3.0
26: Louisville International Airport 12.66938406 -2.0
27: Chicago Midway Intl 12.36422360 -1.0
28: Sacramento Intl 12.10992908 4.0
29: Jacksonville Intl 11.84483416 -2.0
30: Nashville Intl 11.81245891 -2.0
31: Portland Intl Jetport 11.66040210 -4.0
32: Greater Rochester Intl 11.56064461 -5.0
33: Hartsfield Jackson Atlanta Intl 11.30011285 -1.0
34: Lambert St Louis Intl 11.07846451 -3.0
35: Norfolk Intl 10.94909344 -4.0
36: Baltimore Washington Intl 10.72673385 -5.0
37: Memphis Intl 10.64531435 -2.5
38: Port Columbus Intl 10.60132291 -3.0
39: Charleston Afb Intl 10.59296847 -4.0
40: Philadelphia Intl 10.12719014 -3.0
41: Raleigh Durham Intl 10.05238095 -3.0
42: Indianapolis Intl 9.94043412 -3.0
43: Charlottesville-Albemarle 9.50000000 -5.0
44: Cleveland Hopkins Intl 9.18161129 -5.0
45: Ronald Reagan Washington Natl 9.06695204 -2.0
46: Burlington Intl 8.95099602 -4.0
47: Buffalo Niagara Intl 8.94595186 -5.0
48: Syracuse Hancock Intl 8.90392501 -5.0
49: Denver Intl 8.60650021 -2.0
50: Palm Beach Intl 8.56297210 -3.0
51: Bob Hope 8.17567568 -3.0
52: Fort Lauderdale Hollywood Intl 8.08212154 -3.0
53: Bangor Intl 8.02793296 -9.0
54: Asheville Regional Airport 8.00383142 -1.0
55: Pittsburgh Intl 7.68099053 -5.0
56: Gallatin Field 7.60000000 -2.0
57: NW Arkansas Regional 7.46572581 -2.0
58: Tampa Intl 7.40852503 -4.0
59: Charlotte Douglas Intl 7.36031885 -3.0
60: Minneapolis St Paul Intl 7.27016886 -5.0
61: William P Hobby 7.17618819 -4.0
62: Bradley Intl 7.04854369 -10.0
63: San Antonio Intl 6.94537178 -9.0
64: South Bend Rgnl 6.50000000 -3.5
65: Louis Armstrong New Orleans Intl 6.49017497 -6.0
66: Key West Intl 6.35294118 7.0
67: Eagle Co Rgnl 6.30434783 -4.0
68: Austin Bergstrom Intl 6.01990875 -5.0
69: Chicago Ohare Intl 5.87661475 -8.0
70: Orlando Intl 5.45464309 -5.0
71: Detroit Metro Wayne Co 5.42996346 -7.0
72: Portland Intl 5.14157973 -5.0
73: Nantucket Mem 4.85227273 -3.0
74: Wilmington Intl 4.63551402 -7.0
75: Myrtle Beach Intl 4.60344828 -13.0
76: Albuquerque International Sunport 4.38188976 -5.5
77: George Bush Intercontinental 4.24079040 -5.0
78: Norman Y Mineta San Jose Intl 3.44817073 -7.0
79: Southwest Florida Intl 3.23814963 -5.0
80: San Diego Intl 3.13916574 -5.0
81: Sarasota Bradenton Intl 3.08243131 -5.0
82: Metropolitan Oakland Intl 3.07766990 -9.0
83: General Edward Lawrence Logan Intl 2.91439222 -9.0
84: San Francisco Intl 2.67289152 -8.0
85: Yampa Valley 2.14285714 2.0
86: Phoenix Sky Harbor Intl 2.09704733 -6.0
87: Montrose Regional Airport 1.78571429 -10.5
88: Los Angeles Intl 0.54711094 -7.0
89: Dallas Fort Worth Intl 0.32212685 -9.0
90: Miami Intl 0.29905978 -9.0
91: Mc Carran Intl 0.25772849 -8.0
92: Salt Lake City Intl 0.17625459 -8.0
93: Long Beach -0.06202723 -10.0
94: Martha\\\\'s Vineyard -0.28571429 -11.0
95: Seattle Tacoma Intl -1.09909910 -11.0
96: Honolulu Intl -1.36519258 -7.0
97: John Wayne Arpt Orange Co -7.86822660 -11.0
98: Palm Springs Intl -12.72222222 -13.5
name mean_delay med_delay
3.b
planes_dt =as.data.table(planes)top_model <- flights_dt[planes_dt, # merge flights and planes on tailnum on = .(tailnum)][, `:=`(# calculate the air time in hourtime = air_time /60, # calculate the mphmph = distance / (air_time /60))][, .(# calculate the mean mphavgmph =mean(mph, na.rm =TRUE),# the number of filghtsnumflights = .N), by = model][order(-avgmph)][1] #order table in descending average mphprint(top_model)
model avgmph numflights
<char> <num> <int>
1: 777-222 482.6254 4